import jsonlines

from datasets import load_dataset
from tqdm import tqdm
import sys
def context_key(record):
    """Return the identity key used to match a record across the two files.

    The key pairs the stringified ``id`` with the first 100 characters of the
    last conversation turn's ``value``. A tuple is used instead of plain
    string concatenation so that, for example, id ``1`` + text ``"23..."``
    can never collide with id ``12`` + text ``"3..."``.

    Raises:
        KeyError: if ``id``, ``conversations`` or ``value`` is missing.
        IndexError: if ``conversations`` is empty.
    """
    last_turn = record["conversations"][-1]
    return (str(record["id"]), last_turn["value"][:100])


def collect_labeled_keys(records):
    """Return the set of keys of records carrying a non-empty ability label.

    Records whose ``record["label"]["ability"]`` is empty (or otherwise
    falsy) are treated as not labeled and contribute no key.
    """
    return {
        context_key(record)
        for record in records
        if record["label"]["ability"]
    }


def iter_unlabeled(records, labeled_keys):
    """Yield, in input order, the records whose key is not in ``labeled_keys``."""
    for record in records:
        if context_key(record) not in labeled_keys:
            yield record


def main():
    """Write the records of the ``_all`` file that have no ability label yet.

    Reads ``7M_label_{idx}.jsonl`` to learn which records are already labeled,
    then copies every record of ``7M_label_filter_{idx}_all.jsonl`` that is
    not among them to ``7M_label_filter_{idx}.jsonl``. ``idx`` is the first
    command-line argument.
    """
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <idx>")
    idx = sys.argv[1]

    # Build the lookup set first so the (potentially large) raw file can be
    # streamed straight to the output instead of being held in memory.
    with jsonlines.open(f"7M_label_{idx}.jsonl") as labeled:
        labeled_keys = collect_labeled_keys(tqdm(labeled))

    # NOTE: the output is written while the input is being read, so a
    # malformed input record leaves a partially written output file.
    with jsonlines.open(f"7M_label_filter_{idx}_all.jsonl") as source:
        with jsonlines.open(f"7M_label_filter_{idx}.jsonl", "w") as sink:
            for record in iter_unlabeled(source, labeled_keys):
                sink.write(record)


if __name__ == "__main__":
    main()



